Here I use two different datasets in this experiment: the Diabetes dataset (a scikit-learn toy dataset) and the California Housing dataset (a scikit-learn real-world dataset).
# Imports: numpy/pandas for data handling, matplotlib/seaborn for plotting,
# scikit-learn for the toy dataset and the train/test split utility.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Load the diabetes data as a Bunch (dict-like) object.
dataset = load_diabetes()
# Inspect which fields the Bunch exposes.
dataset.keys()
dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])
# Full description of the dataset: instance/attribute counts, attribute
# meanings, provenance etc. (Bunch supports dict-style access.)
print(dataset["DESCR"])
.. _diabetes_dataset:
Diabetes dataset
----------------
Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.
**Data Set Characteristics:**
:Number of Instances: 442
:Number of Attributes: First 10 columns are numeric predictive values
:Target: Column 11 is a quantitative measure of disease progression one year after baseline
:Attribute Information:
- age age in years
- sex
- bmi body mass index
- bp average blood pressure
- s1 tc, total serum cholesterol
- s2 ldl, low-density lipoproteins
- s3 hdl, high-density lipoproteins
- s4 tch, total cholesterol / HDL
- s5 ltg, possibly log of serum triglycerides level
- s6 glu, blood sugar level
Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).
Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.
(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)
#min and max values of each attribute and the target
# Loop over however many feature columns the data has (10 here) instead of
# hard-coding the count.
for i in range(dataset.data.shape[1]):
    print(f"column {i} min: {np.min(dataset.data[:, i])} and max {np.max(dataset.data[:, i])}")
# -1 lets numpy infer the row count, so the reshape is not tied to n = 442.
reshaped_target = dataset.target.reshape((-1, 1))
print(reshaped_target.shape)
print(np.min(reshaped_target), np.max(reshaped_target))
column 0 min: -0.107225631607358 and max 0.110726675453815 column 1 min: -0.044641636506989 and max 0.0506801187398187 column 2 min: -0.0902752958985185 and max 0.17055522598066 column 3 min: -0.112399602060758 and max 0.132044217194516 column 4 min: -0.126780669916514 and max 0.153913713156516 column 5 min: -0.115613065979398 and max 0.198787989657293 column 6 min: -0.10230705051742 and max 0.181179060397284 column 7 min: -0.076394503750001 and max 0.185234443260194 column 8 min: -0.126097385560409 and max 0.133598980013008 column 9 min: -0.137767225690012 and max 0.135611830689079 (442, 1) 25.0 346.0
#scaling the features and the target to the range [0, 1]
from sklearn import preprocessing
# One scaler per array: fit_transform stores the fitted min/max on the scaler
# instance, so reusing a single scaler would leave it fitted to the target
# only and break any later inverse_transform of the features.
feature_scaler = preprocessing.MinMaxScaler()
target_scaler = preprocessing.MinMaxScaler()
scaled_data = feature_scaler.fit_transform(dataset.data)
scaled_target = target_scaler.fit_transform(reshaped_target)
# Wrap the scaled feature matrix in a labelled DataFrame.
feature_columns = dataset.feature_names
df = pd.DataFrame(scaled_data, columns = feature_columns)
df.head()
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.666667 | 1.0 | 0.582645 | 0.549296 | 0.294118 | 0.256972 | 0.207792 | 0.282087 | 0.562217 | 0.439394 |
| 1 | 0.483333 | 0.0 | 0.148760 | 0.352113 | 0.421569 | 0.306773 | 0.623377 | 0.141044 | 0.222443 | 0.166667 |
| 2 | 0.883333 | 1.0 | 0.516529 | 0.436620 | 0.289216 | 0.258964 | 0.246753 | 0.282087 | 0.496584 | 0.409091 |
| 3 | 0.083333 | 0.0 | 0.301653 | 0.309859 | 0.495098 | 0.447211 | 0.233766 | 0.423131 | 0.572936 | 0.469697 |
| 4 | 0.516667 | 0.0 | 0.206612 | 0.549296 | 0.465686 | 0.417331 | 0.389610 | 0.282087 | 0.362369 | 0.333333 |
#append the target variable
#scaled_target is a (442, 1) array; pandas stores it as the single column "Y"
df["Y"] = scaled_target
df.head()
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.666667 | 1.0 | 0.582645 | 0.549296 | 0.294118 | 0.256972 | 0.207792 | 0.282087 | 0.562217 | 0.439394 | 0.392523 |
| 1 | 0.483333 | 0.0 | 0.148760 | 0.352113 | 0.421569 | 0.306773 | 0.623377 | 0.141044 | 0.222443 | 0.166667 | 0.155763 |
| 2 | 0.883333 | 1.0 | 0.516529 | 0.436620 | 0.289216 | 0.258964 | 0.246753 | 0.282087 | 0.496584 | 0.409091 | 0.361371 |
| 3 | 0.083333 | 0.0 | 0.301653 | 0.309859 | 0.495098 | 0.447211 | 0.233766 | 0.423131 | 0.572936 | 0.469697 | 0.563863 |
| 4 | 0.516667 | 0.0 | 0.206612 | 0.549296 | 0.465686 | 0.417331 | 0.389610 | 0.282087 | 0.362369 | 0.333333 | 0.342679 |
#let's check descriptive statistics of the dataset e.g. mean, max, min, std etc. of each feature vector
#(every column should lie in [0, 1] after the min-max scaling above)
df.describe()
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 442.000000 | 442.000000 | 442.000000 | 442.000000 | 442.000000 | 442.000000 | 442.000000 | 442.000000 | 442.000000 | 442.000000 | 442.000000 |
| mean | 0.491968 | 0.468326 | 0.346107 | 0.459818 | 0.451668 | 0.367725 | 0.360889 | 0.291996 | 0.485557 | 0.503942 | 0.396054 |
| std | 0.218484 | 0.499561 | 0.182567 | 0.194806 | 0.169647 | 0.151460 | 0.167977 | 0.182010 | 0.183364 | 0.174187 | 0.240165 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.320833 | 0.000000 | 0.214876 | 0.309859 | 0.329657 | 0.271165 | 0.237013 | 0.141044 | 0.357528 | 0.382576 | 0.193146 |
| 50% | 0.516667 | 0.000000 | 0.318182 | 0.436620 | 0.436275 | 0.355578 | 0.337662 | 0.282087 | 0.478057 | 0.500000 | 0.359813 |
| 75% | 0.666667 | 1.000000 | 0.465909 | 0.605634 | 0.552696 | 0.462649 | 0.464286 | 0.423131 | 0.610446 | 0.606061 | 0.580997 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
#before splitting the dataset into train and test set randomize data points
from sklearn.utils import shuffle
#random_state = 9 fixes the permutation so the shuffle is reproducible
df = shuffle(df, random_state = 9)
df.head()
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 381 | 0.166667 | 1.0 | 0.004132 | 0.154930 | 0.299020 | 0.285857 | 0.246753 | 0.282087 | 0.435853 | 0.303030 | 0.246106 |
| 392 | 0.300000 | 1.0 | 0.231405 | 0.450704 | 0.529412 | 0.484064 | 0.402597 | 0.282087 | 0.326986 | 0.742424 | 0.289720 |
| 155 | 0.366667 | 1.0 | 0.578512 | 0.661972 | 0.754902 | 0.642430 | 0.350649 | 0.423131 | 0.631167 | 0.681818 | 0.501558 |
| 439 | 0.683333 | 1.0 | 0.285124 | 0.530516 | 0.318627 | 0.323705 | 0.272727 | 0.249647 | 0.305040 | 0.560606 | 0.333333 |
| 331 | 0.866667 | 1.0 | 0.247934 | 0.309859 | 0.200980 | 0.220120 | 0.220779 | 0.282087 | 0.326986 | 0.484848 | 0.542056 |
# Verify there are no missing values in any column (isna is the canonical
# alias of isnull).
df.isna().sum()
age 0 sex 0 bmi 0 bp 0 s1 0 s2 0 s3 0 s4 0 s5 0 s6 0 Y 0 dtype: int64
#a short summary of the dataset
#(column dtypes, non-null counts and memory usage)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 442 entries, 381 to 382 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 442 non-null float64 1 sex 442 non-null float64 2 bmi 442 non-null float64 3 bp 442 non-null float64 4 s1 442 non-null float64 5 s2 442 non-null float64 6 s3 442 non-null float64 7 s4 442 non-null float64 8 s5 442 non-null float64 9 s6 442 non-null float64 10 Y 442 non-null float64 dtypes: float64(11) memory usage: 41.4 KB
#data distribution of each attribute
sns.set(rc = {"figure.figsize": (15, 12)})
fig, axes = plt.subplots(5, 2)
# Hoisted out of the loop: subplots_adjust is figure-wide, so one call is
# equivalent to the original per-iteration calls.
plt.subplots_adjust(hspace = 0.5)
# The first 10 columns are the features; the target "Y" gets its own plot below.
for ax, column in zip(axes.flat, df.columns[:10]):
    # kde = True overlays a kernel density estimate (a smoothed pdf) on the histogram
    sns.histplot(df[column], ax = ax, kde = True)
plt.suptitle("Data distribution of each of the variable excluding the target")
plt.show()
# Distribution of the (min-max scaled) target variable.
sns.set(rc = {"figure.figsize": (15, 5)})
sns.histplot(df["Y"], kde = True)
plt.show()
#pairwise relationship between the variables: histograms on the diagonal,
#scatter plots everywhere else
grid = sns.PairGrid(df)
grid.map_diag(sns.histplot).map_offdiag(sns.scatterplot)
plt.show()
#correlation matrix to measure the linear relationship between variables
#annot = True writes each coefficient inside its square
#coefficients range from -1 to 1: values near 1 indicate a strong positive
#linear relationship and values near -1 a strong negative one
sns.set(rc = {"figure.figsize": (15, 10)})
sns.heatmap(data = df.corr(), annot = True)
plt.show()
We should choose the variables that are highly correlated with the target variable, but always check for multicollinearity while choosing the attributes: if two features are each highly correlated with the target and also highly correlated with each other, choose only one of them, not both. In this dataset two features are highly correlated (>0.5) with the target: "bmi" and "s5". The correlation between "bmi" and "s5" is 0.45 (<0.5), so I keep both variables.
fig, axes = plt.subplots(5, 2)
# Hoisted out of the loop: subplots_adjust is figure-wide, so one call suffices.
plt.subplots_adjust(hspace = 0.5)
# One box plot per feature column; the target "Y" is plotted separately below.
for ax, column in zip(axes.flat, df.columns[:10]):
    sns.boxplot(x = df[column], ax = ax)
plt.suptitle("Box plot of each variable")
plt.show()
sns.set(rc = {"figure.figsize": (15, 5)})
sns.boxplot(x = df["Y"])
plt.show()
#"bmi" and "s5" are the best options since both of the feature is highly correlated with the target variable "Y"
#the correlation between "bmi" and "Y" is 0.59 and correlation between "s5" and "Y" is 0.57 (strong positive correlation)
fig, axes = plt.subplots(3)
sns.set(rc = {"figure.figsize": (25, 12)})
plt.subplots_adjust(hspace = 0.3)
sns.scatterplot(x = df["bmi"], y = df["Y"], ax = axes[0]).set(title = "bmi vs. Y")
sns.scatterplot(x = df['s5'], y = df["Y"], ax = axes[1]).set(title = "s5 vs. Y")
sns.scatterplot(x = df['sex'], y = df["Y"], ax = axes[2]).set(title = "sex vs. Y")
plt.suptitle("bmi vs. Y: corr coeff 0.59, s5 vs. Y: corr coeff 0.57 and sex vs. Y corr coeff 0.043 data points")
plt.show()
# Keep only the two features most correlated with the target.
X = df[["bmi", "s5"]]
X.head()
| bmi | s5 | |
|---|---|---|
| 381 | 0.004132 | 0.435853 |
| 392 | 0.231405 | 0.326986 |
| 155 | 0.578512 | 0.631167 |
| 439 | 0.285124 | 0.305040 |
| 331 | 0.247934 | 0.326986 |
# Target as a single-column DataFrame (keeps the 2-D shape).
y = df[["Y"]]
y.tail()
| Y | |
|---|---|
| 56 | 0.084112 |
| 438 | 0.246106 |
| 126 | 0.230530 |
| 348 | 0.383178 |
| 382 | 0.333333 |
#split the dataset into train and test set
#test_size = 0.2 means 20% of the data will be used for testing
# random_state pins the split so results are reproducible across runs,
# consistent with the seeded shuffle (random_state = 9) applied earlier.
X_train0, X_test0, y_train0, y_test0 = train_test_split(X, y, test_size = 0.2, random_state = 9)
print(X_train0.shape, y_train0.shape)
print(X_test0.shape, y_test0.shape)
(353, 2) (353, 1) (89, 2) (89, 1)
#load the California housing dataset (downloaded and cached on first use)
from sklearn.datasets import fetch_california_housing
dataset = fetch_california_housing()
#inspect which fields the returned Bunch object exposes
dataset.keys()
dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])
#description of the dataset: instance/attribute counts and attribute meanings
print(dataset.DESCR)
.. _california_housing_dataset:
California Housing dataset
--------------------------
**Data Set Characteristics:**
:Number of Instances: 20640
:Number of Attributes: 8 numeric, predictive attributes and the target
:Attribute Information:
- MedInc median income in block group
- HouseAge median house age in block group
- AveRooms average number of rooms per household
- AveBedrms average number of bedrooms per household
- Population block group population
- AveOccup average number of household members
- Latitude block group latitude
- Longitude block group longitude
:Missing Attribute Values: None
This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).
This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bureau publishes sample data (a block group typically has a population
of 600 to 3,000 people).
A household is a group of people residing within a home. Since the average
number of rooms and bedrooms in this dataset are provided per household, these
columns may take surprisingly large values for block groups with few households
and many empty houses, such as vacation resorts.
It can be downloaded/loaded using the
:func:`sklearn.datasets.fetch_california_housing` function.
.. topic:: References
- Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297
# Value range of each feature column and the target, to motivate scaling.
# Derive the column count from the data instead of hard-coding 8.
for i in range(dataset.data.shape[1]):
    print(f"column {i} min: {np.min(dataset.data[:, i])} and max {np.max(dataset.data[:, i])}")
# -1 infers the row count (20640) instead of hard-coding it.
reshaped_target = dataset.target.reshape((-1, 1))
print(reshaped_target.shape)
print(np.min(reshaped_target), np.max(reshaped_target))
column 0 min: 0.4999 and max 15.0001 column 1 min: 1.0 and max 52.0 column 2 min: 0.8461538461538461 and max 141.9090909090909 column 3 min: 0.3333333333333333 and max 34.06666666666667 column 4 min: 3.0 and max 35682.0 column 5 min: 0.6923076923076923 and max 1243.3333333333333 column 6 min: 32.54 and max 41.95 column 7 min: -124.35 and max -114.31 (20640, 1) 0.14999 5.00001
from sklearn import preprocessing
# One scaler per array: fit_transform stores the fitted min/max on the scaler
# instance, so reusing a single scaler would leave it fitted to the target
# only and break any later inverse_transform of the features.
feature_scaler = preprocessing.MinMaxScaler()
target_scaler = preprocessing.MinMaxScaler()
scaled_data = feature_scaler.fit_transform(dataset.data)
scaled_target = target_scaler.fit_transform(reshaped_target)
# Assemble the scaled features and target into one DataFrame.
df = pd.DataFrame(data = scaled_data, columns = dataset.feature_names)
df["Y"] = scaled_target
df.head()
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | Y | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.539668 | 0.784314 | 0.043512 | 0.020469 | 0.008941 | 0.001499 | 0.567481 | 0.211155 | 0.902266 |
| 1 | 0.538027 | 0.392157 | 0.038224 | 0.018929 | 0.067210 | 0.001141 | 0.565356 | 0.212151 | 0.708247 |
| 2 | 0.466028 | 1.000000 | 0.052756 | 0.021940 | 0.013818 | 0.001698 | 0.564293 | 0.210159 | 0.695051 |
| 3 | 0.354699 | 1.000000 | 0.035241 | 0.021929 | 0.015555 | 0.001493 | 0.564293 | 0.209163 | 0.672783 |
| 4 | 0.230776 | 1.000000 | 0.038534 | 0.022166 | 0.015752 | 0.001198 | 0.564293 | 0.209163 | 0.674638 |
#descriptive statistics (count, mean, std, min, quartiles, max) per column
df.describe()
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | Y | |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | 0.232464 | 0.541951 | 0.032488 | 0.022629 | 0.039869 | 0.001914 | 0.328572 | 0.476125 | 0.395579 |
| std | 0.131020 | 0.246776 | 0.017539 | 0.014049 | 0.031740 | 0.008358 | 0.226988 | 0.199555 | 0.237928 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.142308 | 0.333333 | 0.025482 | 0.019943 | 0.021974 | 0.001398 | 0.147715 | 0.253984 | 0.215671 |
| 50% | 0.209301 | 0.549020 | 0.031071 | 0.021209 | 0.032596 | 0.001711 | 0.182784 | 0.583665 | 0.339588 |
| 75% | 0.292641 | 0.705882 | 0.036907 | 0.022713 | 0.048264 | 0.002084 | 0.549416 | 0.631474 | 0.514897 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
#randomize row order before splitting; random_state = 9 makes it reproducible
df = shuffle(df, random_state = 9)
df.head()
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | Y | |
|---|---|---|---|---|---|---|---|---|---|
| 20558 | 0.279493 | 0.176471 | 0.033123 | 0.019920 | 0.029990 | 0.001727 | 0.651435 | 0.253984 | 0.287424 |
| 11430 | 0.369250 | 0.411765 | 0.039712 | 0.019421 | 0.052356 | 0.001923 | 0.119022 | 0.635458 | 0.549484 |
| 2656 | 0.099433 | 0.607843 | 0.048212 | 0.056541 | 0.012080 | 0.001311 | 0.822529 | 0.009960 | 0.125981 |
| 14267 | 0.075406 | 0.705882 | 0.022346 | 0.021149 | 0.069677 | 0.002787 | 0.015940 | 0.721116 | 0.147012 |
| 3684 | 0.183197 | 0.686275 | 0.026569 | 0.022013 | 0.033017 | 0.002579 | 0.177471 | 0.595618 | 0.332784 |
# Confirm there are no missing values (isna is the canonical alias of isnull).
df.isna().sum()
MedInc 0 HouseAge 0 AveRooms 0 AveBedrms 0 Population 0 AveOccup 0 Latitude 0 Longitude 0 Y 0 dtype: int64
#short summary: column dtypes, non-null counts and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 20640 entries, 20558 to 501 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MedInc 20640 non-null float64 1 HouseAge 20640 non-null float64 2 AveRooms 20640 non-null float64 3 AveBedrms 20640 non-null float64 4 Population 20640 non-null float64 5 AveOccup 20640 non-null float64 6 Latitude 20640 non-null float64 7 Longitude 20640 non-null float64 8 Y 20640 non-null float64 dtypes: float64(9) memory usage: 1.6 MB
sns.set(rc = {"figure.figsize": (15, 12)})
fig, axes = plt.subplots(4, 2)
count = 0
for i in range(4):
for j in range(2):
plt.subplots_adjust(hspace = 0.5)
sns.histplot(df[df.keys()[count]], ax = axes[i, j], kde = True)
#kde is kernel density estimation which estimates the pdf of a continuous random variable
count += 1
plt.suptitle("Data distribution of each of the variable excluding the target")
plt.show()
sns.set(rc = {"figure.figsize": (15, 5)})
sns.histplot(df["Y"], kde = True)
<AxesSubplot:xlabel='Y', ylabel='Count'>
# Pairwise views: histograms on the diagonal, scatter plots off-diagonal.
grid = sns.PairGrid(df)
grid.map_diag(sns.histplot).map_offdiag(sns.scatterplot)
plt.show()
sns.set(rc = {"figure.figsize": (15, 10)})
corr_matrix = df.corr()
sns.heatmap(data = corr_matrix, annot = True)
plt.show()
In this dataset one feature is highly correlated (>0.5) with the target: "MedInc". The correlation between "MedInc" and "Y" is 0.69 (>0.5), so I use this variable. The other variables are not strongly correlated with the target (<0.2). For ease of experimentation with Gradient Descent I decided not to use the other variables: they are not highly correlated with the target, the number of instances is high, and Gradient Descent is easier to run when the number of dimensions is smaller.
fig, axes = plt.subplots(4, 2)
# Hoisted out of the loop: subplots_adjust is figure-wide, so one call suffices.
plt.subplots_adjust(hspace = 0.5)
# One box plot per feature column; the target "Y" is plotted separately below.
for ax, column in zip(axes.flat, df.columns[:8]):
    sns.boxplot(x = df[column], ax = ax)
plt.suptitle("Box plot of each variable")
plt.show()
sns.set(rc = {"figure.figsize": (15, 5)})
sns.boxplot(x = df["Y"])
plt.show()
fig, axes = plt.subplots(3)
sns.set(rc = {"figure.figsize": (25, 12)})
plt.subplots_adjust(hspace = 0.3)
# Plot the chosen feature (plus two weakly correlated ones for contrast)
# against the target.
for axis, feature in zip(axes, ("MedInc", "AveRooms", "Latitude")):
    sns.scatterplot(x = df[feature], y = df["Y"], ax = axis).set(title = f"{feature} vs. Y")
plt.suptitle("MedInc vs. Y: corr coeff 0.69, AveRooms vs. Y: corr coeff 0.15 and Latitude vs. Y corr coeff 0.14 data points")
plt.show()
X = pd.DataFrame(df["MedInc"])
y = pd.DataFrame(df["Y"])
X.head()
| MedInc | |
|---|---|
| 20558 | 0.279493 |
| 11430 | 0.369250 |
| 2656 | 0.099433 |
| 14267 | 0.075406 |
| 3684 | 0.183197 |
#last five rows of the target frame
y.tail()
| Y | |
|---|---|
| 4532 | 0.256908 |
| 4673 | 0.956699 |
| 5014 | 0.201238 |
| 9979 | 0.171960 |
| 501 | 0.188249 |
#test set size is 0.1 or 10% because we have more data in this case so it's better to keep more data for training
# random_state pins the split so results are reproducible across runs,
# consistent with the seeded shuffle (random_state = 9) applied earlier.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size = 0.1, random_state = 9)
print(X_train1.shape, y_train1.shape)
print(X_test1.shape, y_test1.shape)
(18576, 1) (18576, 1) (2064, 1) (2064, 1)
#TBC